{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generating trading signals with LightGBM and CatBoost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports & Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:53.361121Z",
     "start_time": "2020-06-21T03:15:53.359422Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "# Suppress every warning category for the rest of the session. Note that this\n",
    "# also hides deprecation notices emitted by the libraries used below.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:54.473652Z",
     "start_time": "2020-06-21T03:15:53.619170Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "from pathlib import Path\n",
    "import sys, os\n",
    "from time import time\n",
    "from collections import defaultdict\n",
    "from itertools import product\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import lightgbm as lgb\n",
    "from catboost import Pool, CatBoostRegressor\n",
    "\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from scipy.stats import spearmanr\n",
    "\n",
    "from alphalens.tears import (create_summary_tear_sheet,\n",
    "                             create_full_tear_sheet)\n",
    "\n",
    "from alphalens.utils import get_clean_factor_and_forward_returns\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:54.477216Z",
     "start_time": "2020-06-21T03:15:54.474618Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Put the parent of the first sys.path entry on the module search path before\n",
    "# importing `utils` -- presumably that is where utils.py lives (TODO confirm).\n",
    "sys.path.insert(1, os.path.join(sys.path[0], '..'))\n",
    "from utils import MultipleTimeSeriesCV, format_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:54.489618Z",
     "start_time": "2020-06-21T03:15:54.478409Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Apply seaborn's 'whitegrid' style to the figures drawn in this notebook.\n",
    "sns.set_style('whitegrid')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:54.626839Z",
     "start_time": "2020-06-21T03:15:54.624975Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Number of periods treated as one year when sizing the cross-validation.\n",
    "# NOTE(review): 252 is presumably trading days per year -- confirm.\n",
    "YEAR = 252\n",
    "# Shorthand for label-based slicing of a pandas MultiIndex.\n",
    "idx = pd.IndexSlice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Get Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.610627Z",
     "start_time": "2020-06-21T03:15:56.699840Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 1749266 entries, ('A', Timestamp('2010-01-04 00:00:00')) to ('ZION', Timestamp('2016-12-30 00:00:00'))\n",
      "Data columns (total 33 columns):\n",
      " #   Column           Non-Null Count    Dtype  \n",
      "---  ------           --------------    -----  \n",
      " 0   dollar_vol_rank  1749266 non-null  float64\n",
      " 1   rsi              1735336 non-null  float64\n",
      " 2   bb_high          1730361 non-null  float64\n",
      " 3   bb_low           1730359 non-null  float64\n",
      " 4   NATR             1735336 non-null  float64\n",
      " 5   ATR              1735336 non-null  float64\n",
      " 6   PPO              1724391 non-null  float64\n",
      " 7   MACD             1716431 non-null  float64\n",
      " 8   sector           1749266 non-null  int64  \n",
      " 9   r01              1748271 non-null  float64\n",
      " 10  r05              1744291 non-null  float64\n",
      " 11  r10              1739316 non-null  float64\n",
      " 12  r21              1728371 non-null  float64\n",
      " 13  r42              1707476 non-null  float64\n",
      " 14  r63              1686581 non-null  float64\n",
      " 15  r01dec           1748271 non-null  float64\n",
      " 16  r05dec           1744291 non-null  float64\n",
      " 17  r10dec           1739316 non-null  float64\n",
      " 18  r21dec           1728371 non-null  float64\n",
      " 19  r42dec           1707476 non-null  float64\n",
      " 20  r63dec           1686581 non-null  float64\n",
      " 21  r01q_sector      1748271 non-null  float64\n",
      " 22  r05q_sector      1744291 non-null  float64\n",
      " 23  r10q_sector      1739316 non-null  float64\n",
      " 24  r21q_sector      1728371 non-null  float64\n",
      " 25  r42q_sector      1707476 non-null  float64\n",
      " 26  r63q_sector      1686581 non-null  float64\n",
      " 27  r01_fwd          1749266 non-null  float64\n",
      " 28  r05_fwd          1749266 non-null  float64\n",
      " 29  r21_fwd          1749251 non-null  float64\n",
      " 30  year             1749266 non-null  int64  \n",
      " 31  month            1749266 non-null  int64  \n",
      " 32  weekday          1749266 non-null  int64  \n",
      "dtypes: float64(29), int64(4)\n",
      "memory usage: 447.2+ MB\n"
     ]
    }
   ],
   "source": [
    "# Load the model data, sort the MultiIndex so it can be sliced, and keep\n",
    "# observations dated up to and including 2016.\n",
    "data = (pd.read_hdf('data.h5', 'model_data')\n",
    "            .sort_index()\n",
    "            .loc[idx[:, :'2016'], :])\n",
    "# `null_counts` was deprecated in pandas 1.2 and removed in 2.0; `show_counts`\n",
    "# is its replacement (requires pandas >= 1.2) and forces the non-null counts\n",
    "# to be printed even for a frame this large.\n",
    "data.info(show_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.627071Z",
     "start_time": "2020-06-21T03:15:58.611792Z"
    }
   },
   "outputs": [],
   "source": [
    "# Targets are the columns whose name contains '_fwd'; every other column is\n",
    "# used as a feature. Both lists are kept in sorted order.\n",
    "labels = sorted(col for col in data.columns if '_fwd' in col)\n",
    "features = sorted(set(data.columns) - set(labels))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Selection: Lookback, lookahead and roll-forward periods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.713463Z",
     "start_time": "2020-06-21T03:15:58.628189Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Distinct values of the 'symbol' level of the index.\n",
    "tickers = data.index.get_level_values('symbol').unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.716838Z",
     "start_time": "2020-06-21T03:15:58.714860Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Forecast horizons; each value selects the matching r{NN}_fwd label column.\n",
    "lookaheads = [1, 5, 21]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.726739Z",
     "start_time": "2020-06-21T03:15:58.718248Z"
    }
   },
   "outputs": [],
   "source": [
    "# Columns treated as categorical (one-hot encoded for the linear baseline).\n",
    "categoricals = ['year', 'month', 'sector', 'weekday']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.735386Z",
     "start_time": "2020-06-21T03:15:58.728023Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Candidate rolling-window sizes, in the same period units as YEAR: training\n",
    "# spans of 4.5 years or 1 year, test spans of 63 or 21 periods. Reusing YEAR\n",
    "# keeps these in sync with the constant instead of repeating the literal 252.\n",
    "train_lengths = [int(4.5 * YEAR), YEAR]\n",
    "test_lengths = [63, 21]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.743174Z",
     "start_time": "2020-06-21T03:15:58.737182Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Every (lookahead, train_length, test_length) combination to evaluate.\n",
    "test_params = list(product(lookaheads, train_lengths, test_lengths))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:15:58.751241Z",
     "start_time": "2020-06-21T03:15:58.744605Z"
    }
   },
   "outputs": [],
   "source": [
    "# Output directory for the metrics written below. `exist_ok=True` makes the\n",
    "# call idempotent and removes the race between checking and creating.\n",
    "results_path = Path('results', 'us_stocks')\n",
    "results_path.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baseline: Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:16:05.129548Z",
     "start_time": "2020-06-21T03:16:05.127693Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Baseline estimator; this single instance is refit on every CV fold.\n",
    "lr = LinearRegression()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.679350Z",
     "start_time": "2020-06-21T03:16:05.270137Z"
    },
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 1134 63\n",
      "1 1134 21\n",
      "1 252 63\n",
      "1 252 21\n",
      "5 1134 63\n",
      "5 1134 21\n",
      "5 252 63\n",
      "5 252 21\n",
      "21 1134 63\n",
      "21 1134 21\n",
      "21 252 63\n",
      "21 252 21\n"
     ]
    }
   ],
   "source": [
    "# Walk-forward evaluation of the linear baseline for every combination of\n",
    "# lookahead, training window and test window in `test_params`.\n",
    "lr_metrics = []\n",
    "cached_lookahead = None\n",
    "for lookahead, train_length, test_length in test_params:\n",
    "    print(lookahead, train_length, test_length)\n",
    "\n",
    "    # The design matrix depends only on the lookahead (it selects the label),\n",
    "    # so rebuild it only when the lookahead changes, not on every iteration.\n",
    "    if lookahead != cached_lookahead:\n",
    "        label = f'r{lookahead:02}_fwd'\n",
    "        df = pd.get_dummies(data.loc[:, features + [label]].dropna(),\n",
    "                            columns=categoricals,\n",
    "                            drop_first=True)\n",
    "        X, y = df.drop(label, axis=1), df[label]\n",
    "        cached_lookahead = lookahead\n",
    "\n",
    "    # Number of test windows chosen so that together they cover 2 * YEAR periods.\n",
    "    n_splits = int(2 * YEAR / test_length)\n",
    "    cv = MultipleTimeSeriesCV(n_splits=n_splits,\n",
    "                              test_period_length=test_length,\n",
    "                              lookahead=lookahead,\n",
    "                              train_period_length=train_length)\n",
    "\n",
    "    ic, preds = [], []\n",
    "    for train_idx, test_idx in cv.split(X=X):\n",
    "        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]\n",
    "        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]\n",
    "        lr.fit(X_train, y_train)\n",
    "        y_pred = lr.predict(X_test)\n",
    "        preds.append(y_test.to_frame('y_true').assign(y_pred=y_pred))\n",
    "        # Spearman rank correlation of predictions vs. outcomes for this fold.\n",
    "        ic.append(spearmanr(y_test, y_pred)[0])\n",
    "    preds = pd.concat(preds)\n",
    "    # NOTE(review): the value stored as 'ic_by_day' is the mean of the per-fold\n",
    "    # ICs collected above, which does not look like a per-date average --\n",
    "    # confirm that the column name is intended.\n",
    "    lr_metrics.append([\n",
    "        lookahead, train_length, test_length,\n",
    "        np.mean(ic),\n",
    "        spearmanr(preds.y_true, preds.y_pred)[0]\n",
    "    ])\n",
    "\n",
    "columns = ['lookahead', 'train_length', 'test_length', 'ic_by_day', 'ic']\n",
    "lr_metrics = pd.DataFrame(lr_metrics, columns=columns)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Information Coefficient - Distribution by Lookahead"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.897475Z",
     "start_time": "2020-06-21T03:19:37.680445Z"
    }
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAA+gAAAFgCAYAAAAo31N4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3dfVTUdaLH8c8A8jCOSKDXh0wikLKtDdE9p1J7MN2QoykpCXnMm1a3dpXSzTRTM1PESjcpsWMPe43rTcpqN5XYMi1Xtydd2bLUQFfqtkXmaDoODg8z9w/X2XV1HEHn9/vJvF/ndJL5zY/vZxD5zofv78Hm8/l8AgAAAAAApoowOwAAAAAAAKCgAwAAAABgCRR0AAAAAAAsgIIOAAAAAIAFUNABAAAAALAACjoAAAAAABYQZXYAANYxYMAALV68WFdeeaUkacOGDXrppZd06NAhNTY2qkePHpo6daq6dOly0r6XXnqpPvzwQyUmJrZ47DZt2ig2NlY+n08+n0/Z2dm6++67FRXFjyoAQOtwNnOtJH344YcqKSlRbW2tYmNjlZSUpF//+tfq06dPyLN//PHHevzxx7VmzRpNmzZNPXr00Pjx40M+LhBOeNcL4JRWr16tpUuXaunSpUpOTpbP59OyZct0xx13aO3atYqOjj7nYz711FP+Nyxut1sPPvig5s+fr5kzZ57zsQAAMFtz59r33ntPRUVFeuKJJ9SrVy9JUmVlpSZNmqTZs2fr+uuvN+NlADiHOMQdwCn99re/1SOPPKLk5GRJks1m0z333KOJEyeqvr7+lPs8/fTTysnJ0bBhw7RhwwZJ0p133qlXX33V/5ySkhIVFhYGHd9ut2vWrFkqKyuTy+WS2+3WQw89pFGjRunmm2/Wrbfeqj179ujvf/+7MjMzdfjwYUmSz+fTzTffrJ07d57tlwAAgJBq7lz7xBNPaMaMGf5yLkkZGRmaPn26nnjiCR0+fFiZmZnat2+ff3tubq4++OAD1dfXq7CwUDk5Obrllls0bdo0uVwuScdW9R944AENHjxY7777rjZs2KC8vDzdeuutuuGGG/T000+H+CsB4DgKOoCTHDhwQN9++60yMzNPeNxms+mWW26Rw+E45X7dunXTm2++qSeffFLTpk2T0+nU6NGj/QXd6/Vq1apVysvLO6McnTt3lsPh0J49e7Rx40bFx8errKxMf/zjH3XFFVdoxYoV6tq1q66++mq99dZbkqSPPvpICQkJuuyyy87iKwAAQGg1d649cOCA9u7dq1/84hcnfa5rrrlG1dXV8nq9GjRokH9O3L17t3788Uf1799fy5YtU2RkpN544w299dZb+o//+A899dRT/s/Ro0cPvf322xo4cKBeeuklFRUV6Y033lBZWZmWLVsmp9MZgq8CgH/HIe4AThIRcex3d16vt1n75efnS5LS09OVmpqqbdu26cYbb9S8efO0c+dO1dbWqlu3brrkkkvO+HPabDbFxcUpKytLF110kUpLS1VTU6NPPvnEv4IwevRoPfnkkxo9erTKysr8OQAAsKqWzrWNjY0nPdbQ0CDp2JyZm5urxx57TOPHj9frr7+uESNGKCIiQu+//74OHz6sP//5z/59kpKS/J/j+DnsNptNzz33nN5//32tWbNGu3fvls/nU11dXYteJ4DmYQUdwEnat2+viy++WH/9619P2nb//fcHPHz8+JsN6dgbjqioKEVGRmrUqFFatWqVXn/99TNePZekb7/9Vm63W927d9f//u//6pFHHlFsbKyGDh2qIUOGyOfzSZKuvfZa1dXV6cMPP9SWLVs0ePDgZr5iAACM1dy59oILLlBKSoo++eSTk57/0UcfKTU1VfHx8erTp48aGxv12Wefac2aNRoxYoSkY/Py9OnT9Yc//EF/+MMf9Nprr2nx4sX+z2G32yUduwZMTk6OvvjiC11++eV66KGHFBUV
5Z9zAYQWBR3AKU2YMEHz5s1TTU2NJKmpqUklJSXauXNnwBXwN998U5L0xRdf6Ouvv9ZVV10l6dj5b+vWrdMXX3yhQYMGndH4hw4d0uOPP67Ro0crJiZGmzZtUk5OjnJzc5WSkqL169erqalJ0rHf9t9+++165JFHNGTIEMXExJztywcAIOSaO9c+/PDDKiwsVGVlpf+xbdu2qaioSA8++KD/sdzcXD3++OO69NJL/VeD79evn1asWKH6+np5vV7NnDlTixYtOmmMmpoauVwuPfDAAxowYIA+/vhj/z4AQo9D3AGc0tChQ+Xz+TR58mQ1NjbK4/HoZz/7mZYvXx7wCu7ffPONhg8fLpvNpkWLFikhIUGSlJSUpCuuuEKpqalq06ZNwDEffPBBxcbGKjIyUk1NTfrlL3+pe++9V5I0btw4zZo1S6tWrZJ07KI4X331lX/fnJwcLViwQKNGjTpXXwIAAEKquXPt9ddfrwULFmjx4sWqra2V1+tV586dtWDBAl199dX+5w0fPlyLFi06oYD/6le/0oIFC5STk6Ompib17NlT06ZNO2mMSy+9VDfccIMGDx6s6OhopaenKy0tTTU1NSG5gwuAE9l8HK8CIMScTqdGjhypFStWBLyv69lau3at3nzzTb3wwgsh+fwAAABAqLGCDiCkXn31VS1atEgTJ04MWTkfM2aMnE6nSkpKQvL5AQAAACOwgg4AAAAAgAVwkTgAAAAAACzAsEPcvV6vZs+erV27dik6Olpz585VcnKyf/uaNWu0fPlyRUZGKj09XbNnz5ak0+4DAAAAAEBrYVhBX7dunerr61VWVqbKykoVFRVp6dKlkqSjR4/q6aef1urVqxUXF6fJkydrw4YNampqCrjP6VRWVnKbJQAAWsDj8SgjI+OMnst8CwBAywSabw0r6Fu3blX//v0lHbs90vbt2/3boqOjtXLlSsXFxUmSGhsbFRMToz/96U8B9zmdmJgY9ezZ8xy/AgAAWr8dO3ac8XOZbwEAaJlA861hBd3lcsnhcPg/joyMVGNjo6KiohQREaEOHTpIkkpLS+V2u9W3b1+9/fbbAfc5HY/H06w3GAAAoPmYbwEAOLcMK+gOh0NHjhzxf+z1ek8o2l6vV08++aT+9re/6ZlnnpHNZgu6TyD8Rh8AgJZhBR0AgNALNN8adhX3zMxMbdy4UdKxc9bS09NP2D5r1ix5PB6VlJT4D3UPtg8AAAAAAK2FYSvogwYN0ubNm5WXlyefz6fCwkKtXr1abrdbV1xxhVatWqU+ffpo7NixkqQ77rjjlPsAAAAAANAaGVbQIyIiNGfOnBMeS01N9f95586dp9zv3/cBAAAAAKA1MuwQdwAAAAAAEBgFHQAAAAAAC6CgAwAAAABgARR0AAAAAAAsgIIOAAAAAIAFUNABAAAAALAAw26zBsAaKioqVF5ebsrYTqdTkpSYmGjK+NnZ2crKyjJlbAAAACAYCjoAw+zfv1+SeQUdAAAAsDIKOhBmsrKyTFtFLigokCQVFxebMj4AAABgZZyDDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAugoAMAAAAAYAEUdAAAAAAALICCDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAugoAMAAAAAYAFRZgcAAAAAALRMRUWFysvLTRnb6XRKkhITE00ZPzs7W1lZWaaMHSoUdAAAAABAs+3fv1+SeQW9NaKgAwAAAMB5Kisry7RV5IKCAklScXGxKeO3RpyDDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgAV3EHTFBcXKzq6mqzYxiuqqpK0j+v+BlO0tLSwvJ1AwAA4MxR0AETVFdX66vtf1F3R5PZUQwV77NJko7u/dTkJMb62hVpdgQAAACcByjogEm6O5o0o4/L7Bgw
wNwtDrMjAAAA4DzAOegAAAAAAFgABR0AAAAAAAugoAMAAAAAYAEUdAAAAAAALICCDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACoowayOv1avbs2dq1a5eio6M1d+5cJScnn/Ccuro63XnnnZo3b55SU1MlScOHD1e7du0kSd26ddP8+fONigwAAAAAgGEMK+jr1q1TfX29ysrKVFlZqaKiIi1dutS//fPPP9ejjz6q2tpa/2Mej0eSVFpaalRMAAAAAABMYVhB37p1q/r37y9JysjI0Pbt20/YXl9fryVLluihhx7yP7Zz507V1dVp3Lhxamxs1OTJk5WRkRF0LI/Hox07dpzbFwCcQ263m/NLwozb7ebnElod5lsACG9ut1uSmAvOIcMKusvlksPh8H8cGRmpxsZGRUUdi9C7d++T9omNjdX48eOVm5urvXv36u6771ZFRYV/n0BiYmLUs2fPc/sCgHPIbrfrqNkhYCi73c7PJZwXmvMmi/kWAMKb3W6XJOaCFgg03xpW0B0Oh44cOeL/2Ov1Bi3aKSkpSk5Ols1mU0pKihISErRv3z516dIl1HEBAAAAADCUYUfZZmZmauPGjZKkyspKpaenB91n1apVKioqkiTV1tbK5XKpY8eOIc0JAAAAAIAZDFtBHzRokDZv3qy8vDz5fD4VFhZq9erVcrvdGjVq1Cn3GTlypB5++GHl5+fLZrOpsLAw6Ko7AAAAAADnI8PabkREhObMmXPCY8dvpfav/vWK7dHR0Vq4cGHIswEAzr2KigqVl5ebMrbT6ZQkJSYmmjJ+dna2srKyTBkbAACcv1iOBgC0Ovv375dkXkEHAABoCQo6ACAksrKyTFtFLigokCQVFxebMj4AAEBLcCtmAAAAAAAsgIIOAAAAAIAFUNABAAAAALAACjoAAAAAABZAQQcAAAAAwAIo6AAAAAAAWAAFHQAAAAAAC6CgAwAAAABgARR0AAAAAAAsgIIOAAAAAIAFRJkdAAAQOsXFxaqurjY7huGqqqokSQUFBSYnMV5aWlpYvm4AAFoDCjoAtGLV1dXa9sU2KcHsJAb7x/Fh277dZm4Oox00OwAAADgbFHQAaO0SJO8NXrNTwAAR73PmGgAA5zNmcgAAAAAALICCDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAugoAMAAAAAYAEUdAAAAAAALICCDgAAAACABVDQAQAAAACwgCizAwAAQsfpdEoHpYj3+X1sWDgoOeOcZqcAAAAtxDs2AAAAAAAsgBV0wAROp1P7Dkdq7haH2VFggJrDkeroNGdVMzExUTV1NfLe4DVlfBgr4v0IJSYmmh0DAAC0EAUdAAAAAM5CcXGxqqurzY5huKqqKklSQUGByUmMl5aWFpLXTUEHTJCYmCj7od2a0cdldhQYYO4Wh2JZ1QQAoNWqrq7Wti+2SQlmJzHYP06Y3vbtNnNzGO1g6D41BR0AAAAAzlaCOKUsTITy4rtcJA4AAAAAAAugoAMAAAAAYAEUdAAAAAAALICCDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAsIWtA3bNhwwsfl5eUhCwMAAAAAQLiKCrRhw4YN+stf/qK1a9dq27ZtkqSmpiatX79e2dnZzR7I6/Vq9uzZ2rVrl6KjozV37lwlJyef8Jy6ujrdeeedmjdvnlJTU89oHwAAAAAAWoOABf2yyy7TwYMHFRMTo5SUFEmSzWbTkCFDWjTQunXrVF9fr7KyMlVWVqqoqEhLly71b//888/16KOPqra29oz3AQCcgYNSxPthdkbT0X/8P9bUFMY7KOlCs0MAAICWCljQu3TpopycHA0bNkwREWf/xm7r1q3q37+/JCkjI0Pbt28/YXt9fb2WLFmihx566Iz3AQCcXlpamtkRTFFVVSVJ6nFhD5OTGOzC
8P07BwCgNQhY0I97/vnn9fzzzys29p/LEJs2bWr2QC6XSw6Hw/9xZGSkGhsbFRV1LELv3r2bvU8gHo9HO3bsaHZGwChut5srNIYZt9ttys+lQYMGadCgQYaPa7aFCxdKku677z6Tk5jDqO815lsAOMbtdpsdAQYL1Xu7oAW9vLxcf/rTnxQXF3dWAzkcDh05csT/sdfrDVq0W7KPJMXExKhnz54tDwuEmN1u9x+Bi/Bgt9v5uWQgu90uSXzNW6A5bzaYbwHgGLvdLh0wOwWMdLbv7QLNt0EX8S688MITVs9bKjMzUxs3bpQkVVZWKj09PST7AAAAAABwPgq6HN3Q0KChQ4cqPT1dNptN0j8PHWyOQYMGafPmzcrLy5PP51NhYaFWr14tt9utUaNGnfE+AAAAAAC0RkEL+t13331OBoqIiNCcOXNOeCw1NfWk55WWlp52HwAAAAAAWqOgh7hffvnl2rx5s37/+9/r4MGD6tSpkxG5AAAAAAAIK0EL+vTp03XRRRdp79696tChgx555BEjcgEAAAAAEFaCFvSDBw9q5MiRioqKUmZmpnw+nxG5AAAAAAAIK2d0K+bdu3dLkr7//ntFRHD3ZgAAAAAAzrWgbfuRRx7R9OnT9eWXX6qgoEDTpk0zIhcAAAAAAGEl6FXcL730UpWVlRmRBQAAAACAsBWwoBcUFKi4uFj9+vU7adumTZtCGgoAAAAAgHATsKAXFxdLOlbG3W637Ha7amtruc0aAAAAAAAhEPQc9GeffdZf1ufNm6dly5aFPBQAAAAAAOEmaEFfv369/8JwxcXFWr9+fchDAQAAAAAQboIWdJvNpvr6eklSQ0MD90EHAAAAACAEgl7FPS8vT0OHDlV6err27Nmju+66y4hcAAAAAACElaAFPTc3VzfddJO++eYbXXTRRUpMTDQiFwAAAAAAYSVgQS8pKdGvfvUrTZ48WTab7YRtCxcuDHkwAAAAAADCScCC7nA4JEnDhw9XbGysYYEAAAAAAAhHAQv6W2+9pZEjR+r555/XSy+9xMXhAAAAAAAIoYAFvW/fvho+fLi+//57ZWVlSZJ8Pp9sNpvee+89wwICAAAAABAOAhb0jh076p133tGzzz6rCRMmGJkJAAAAAICwE7Cgl5WVqVu3bnr33XfVq1evEw5x79evnyHhAAAAAAAIFwEL+v33369169Zp//79WrNmzQnbKOjA2fvaFam5WxxmxzDUT/XH7gjRPjq8rmnxtStS6WaHMEFFRYXKy8tNGbuqqkqSVFBQYMr42dnZ/tPDAAAAzlTAgj5w4EANHDhQ69ev14ABA/TTTz8pPj7+pFuuAWi+tLQ0syOY4pt/lKZOF/cwOYmx0hW+f+dmSUpKMjsCAABAswUs6Mc5HA4NGTJETU1NysrKUteuXZWbm2tEtrBi5kqT0+mUJCUmJpoyfjiuNJm1qme246+7uLjY5CQwQlZWVtj92wYAADgbEcGesHjxYv3P//yPOnTooHvvvVevvPKKEblgoP3792v//v1mxwAAAACAsBZ0BT0iIkIJCQmy2WyKiYlR27ZtjcgVdsxcaWJVEwAAAADMF3QFvXv37lq4cKEOHDigZcuWqWvXrkbkAgAAAAAgrARdQX/sscf02muvqU+fPrLb7Xr88ceNyAUgRLiyNudEAwAAwJqCFnSbzSav1yufz6empiYjMgFopbiyNgAAABBY0II+c+ZMxcfHq1+/fvrkk080Y8YMPfHEE0ZkAxACXFkbAAAAsKagBb2mpkYrVqyQdOze6Hl5eSEPBQAAAABAuAl6kTiPx6O6ujpJ0tGjRznMHQAAAACAEAi6gn7HHXdo2LBh6tGjh6qrq027uBMAAAAAAK1Z0IJ+yy236LrrrtM333yjbt266YILLjAiFwAAAAAAYSXgIe4ul0u/+c1v5HK5lJCQoJqaGs2ZM0cul8vIfAAAAAAAhIWABf3RRx/VlVdeqbZt20o6duXnK664QrNnzzYqGwAA
AAAAYSNgQf/uu+/0n//5n7LZbJKkqKgojR8/Xt98841h4QAAAAAACBcBz0GPiDh1d2/Tpk3IwpituLhY1dXVZscwXFVVlSSF5QUA09LSwvJ1AwAAALCegAU9OTlZ69at08CBA/2Pvffee+rYsaMhwcxQXV2tbZ9/Ka890ewohrI1Hfs22Lr7e5OTGCvC7TQ7AgAAAAD4BSzoU6dO1eTJk7VkyRJ169ZN3333nRITE/XEE08Ymc9wXnuijl4+xOwYMEDsl2vMjgAAAAAAfgELenx8vF544QX9/e9/1w8//KAuXbqoU6dORmYDAAAAzksVFRUqLy83ZWyn89hRgomJ5hwVmp2draysLFPGBs53Qe+D3rVrV3Xt2tWILAAAAADO0v79+yWZV9ABtFzQgg4AAACgebKyskxbRT5+Adzi4mJTxgfQcgFvswYAAAAAAIwTdAV98+bN+t3vfqf6+nr/Yy+//HKzB/J6vZo9e7Z27dql6OhozZ07V8nJyf7t69ev15IlSxQVFaURI0botttukyQNHz5c7dq1kyR169ZN8+fPb/bYAAAAAABYXdCCPn/+fE2fPl2dO3c+q4HWrVun+vp6lZWVqbKyUkVFRVq6dKkkqaGhQfPnz9eqVasUFxen/Px83XjjjYqPj5cklZaWntXYAAAAAABYXdCC3qVLF1177bVnPdDWrVvVv39/SVJGRoa2b9/u37Z79251795d7du3lyT17t1bW7ZsUdeuXVVXV6dx48apsbFRkydPVkZGxllnAQAAAADAaoIW9KSkJM2aNUuXX365bDabJGnUqFHNHsjlcsnhcPg/joyMVGNjo6KiouRyufyHsUtS27Zt5XK5FBsbq/Hjxys3N1d79+7V3XffrYqKCkVFnT62x+PRjh07mp3R7XY3ex+c39xud4u+VwAALZ9vAYTW8fe0/Ps0Dj0i/ISqRwQt6N26dZMk/fjjj2c1kMPh0JEjR/wfe71ef9H+921HjhxRu3btlJKSouTkZNlsNqWkpCghIUH79u1Tly5dTjtWTEyMevbs2eyMdrtd0qFm74fzl91ub9H3CgC0Vs15s9HS+RZAaB17Tyv+fRrIbrdLB8xOASOdbY8INN8GvYr7hAkTdMUVVygmJkaXXXaZJkyY0KIAmZmZ2rhxoySpsrJS6enp/m2pqamqqanRwYMHVV9fry1btqhXr15atWqVioqKJEm1tbVyuVzq2LFji8YHAAAAAMDKgq6gL1y4UDU1NcrMzNTvf/97bd26VVOnTm32QIMGDdLmzZuVl5cnn8+nwsJCrV69Wm63W6NGjdK0adM0fvx4+Xw+jRgxQp06ddLIkSP18MMPKz8/XzabTYWFhUEPbwcAAAAA4HwUtO1++umnWrlypSRp7Nix/tufNVdERITmzJlzwmOpqan+Pw8YMEADBgw4YXt0dLQWLlzYovFawul0KsK9X7FfrjFsTJgnwr1fTme02TEAAAAAQNIZFPTGxkZ5vV5FRETI5/P5LxQHAAAAWFlxcbGqq6vNjmG4qqoqSVJBQYHJSYyXlpYWlq8brUfQgp6dna38/HxdddVV+uyzz5SdnW1ELlMkJibqbwfqdfTyIWZHgQFiv1yjxMREs2MAAIAQqa6u1lfb/6Lujiazoxgq3ndsQe3o3k9NTmKsr12RZkcAzlrQgj5u3Dj169dPe/bs0ciRI0+4uBsAAABgZd0dTZrRx2V2DBhg7hZH8CeFiNPplA5KEe8HvQY3WoODkjPOGZJPHbCgv/baa8rNzdXChQv9h7V/+eWXkqTJkyeHJAwAAAAAAOEqYEHv3LmzJOmSSy454XHOQQcAAACAf0pMTFRNXY28N3jNjgIDRLwfEbJTZQMeg9G/f39J0ueff66cnBz/f3/+859DEgQAAAAAgHAWcAV9xYoVWrp0qX766Se98847/sf/9dZoAAAAAADg3AhY0EePHq3Ro0frueee07333mtkJgAAAAAAwk7Qq7jn5eVpzZo1amxs
lM/n0w8//KD/+q//MiIbAAAAAABhI2hBLygo0MUXX6yvvvpKMTExiouLMyIXAAAAAABhJWhBl6Q5c+bo4Ycf1rx58zR69OhQZzJVhNup2C/XmB3DULaGOkmSr014/fIlwu2U1NnsGAAAAAAg6QwLusfjUV1dnWw2m9xud6gzmSYtLc3sCKaoqqqSJPVIDbey2jls/84BAAAAWE/Qgj569Gj993//t/r27avrr79evXv3NiKXKQoKCsyOYIrjr7u4uNjkJAAAAAAQvoIW9Jtvvtn/58GDB8vhcIQ0EAAAAAAA4ShoQV+5cqVWrlyp+vp6/2Pl5eUhDQUAAAAAQLgJWtBffvllLVu2TO3btzciDwAAAAAAYSloQb/00kvVpUsXRUZGGpEHAAAAAICwFLSgX3311Ro4cKAuuugi+Xw+2Ww2vfzyy0ZkCysVFRWmnTpw/CruZl0kLzs7W1lZWaaMDQAAAABWEbSgl5WV6emnn1a7du2MyAMTJCUlmR0BAAAAAMJe0ILeqVMnXXnllYqIiDAiT9jKyspiFRkAgFbsxx9/1GOPPabZs2fzy3EAwCkFLej19fUaNmyYevToIZvNJklauHBhyIMBAAC0JsuXL9dnn32m5cuXa/LkyWbHAQBYUNCCnp+fr/j4eCOyAAAAtEo//vij3n77bfl8Pr399tsaO3Ysq+gAgJMELegvvviiXnnlFSOyAAAAtErLly+Xz+eTJHm9XlbRAQCnFPTE8vbt22v58uXauHGjNm3apE2bNhmRCwAAoNV499131dDQIElqaGjQO++8Y3IiAIAVBV1Bv+CCC7Rz507t3LnT/1i/fv1CGgoAAKA1GTRokMrLy9XQ0KA2bdrol7/8pdmRAAAWFLSgz58/X1999ZWqq6uVkpKinj17GpELAACg1Rg7dqzefvttSVJERITGjh1rciIAgBUFPcS9tLRUM2fO1LZt2zRz5ky9+OKLRuQCAABoNTp06KDBgwfLZrNp8ODBXCAOAHBKQVfQ16xZoxUrVigqKkoNDQ3Ky8vT+PHjjcgGAADQaowdO1Z79+5l9RwAEFDQgu7z+RQVdexpbdq0UZs2bUIeCgAAoLXp0KGDnnnmGbNjAAAsLGhBz8zMVEFBgXr37q2tW+fCRMQAABBmSURBVLeqV69eRuQCAAAAACCsBDwH/dNPP5UkTZo0SbfeeqsaGxt16623aurUqYaFAwAAAAAgXAQs6AsWLJDb7dZdd92lvn37asyYMbr22mtVX19vZD4AAAAAAMJCwEPc+/btq+HDh+v7779XVlaWpGPno9tsNr333nuGBQQAAABawul0at/hSM3d4jA7CgxQczhSHZ1Os2MAZyVgQZ80aZImTZqkJUuW6Ne//rWRmQAAAAAACDtBLxKXk5Oj559/Xh6Px//YhAkTQhoKAAAAOFuJiYmyH9qtGX1cZkeBAeZucSg2MdHsGMBZCXgO+nEPPPCAXC6XOnTo4P8PAAAAAACcW0FX0Nu2batJkyYZkQUAAAAAgLAVtKD36NFDa9euVc+ePWWz2SRJKSkpIQ8GAAAAAEA4CVrQd+zYoR07dvg/ttlsevnll0MaCgAAAACAcBO0oJeWlhqRAwAAAACAsBawoI8aNcp/SPu/W7lyZcgCAQAAAAAQjgIW9EWLFhmZAwAAAACAsBawoF944YXndCCv16vZs2dr165dio6O1ty5c5WcnOzfvn79ei1ZskRRUVEaMWKEbrvttqD7AACA809xcbGqq6tNGdvpdGr//v2mjG22pKQkJZp0j+i0tDQVFBSYMjYAnE+CnoN+rqxbt0719fUqKytTZWWlioqKtHTpUklSQ0OD5s+fr1WrVikuLk75+fm68cYbtW3btoD7AACA81N1dbW2ff6lvHbjy6KtoU62hnrDx7WCwz/8pL8dMP61R7idho8JAOcrwwr61q1b1b9/f0lSRkaGtm/f7t+2e/dude/eXe3bt5ck9e7dW1u2bFFlZWXAfQAAwPnL
a0/U0cuHmB0DBoj9co3ZEQDgvGFYQXe5XHI4HP6PIyMj1djYqKioKLlcLrVr186/rW3btnK5XKfd53Q8Hs8Jt4YDAADnXkvnW7fbHYI0sDK3223KezO3260Iw0eFmcz8XkN4CdX3mmEF3eFw6MiRI/6PvV6vv2j/+7YjR46oXbt2p93ndGJiYtSzZ89zmB4AgPDQnDcbLZ1v7Xa7pEPN3g/nL7vdbsp7M7vdrqOGjwozmfm9pgOGDwsTne33WqD51rBfKmZmZmrjxo2SpMrKSqWnp/u3paamqqamRgcPHlR9fb22bNmiXr16nXYfAAAAAABaE8NW0AcNGqTNmzcrLy9PPp9PhYWFWr16tdxut0aNGqVp06Zp/Pjx8vl8GjFihDp16nTKfQAAAAAAaI0MK+gRERGaM2fOCY+lpqb6/zxgwAANGDAg6D4AAAAAALRGXDcDAAAAAAALMGwFHQAAQJKcTqci3Pu5/VaYiHDvl9MZbXYMADgvsIIOAAAAAIAFsIIOAAAMlZiYqL8dqNfRy4eYHQUGiP1yjRITE82OAQDnBVbQAQAAAACwAFbQAQCA4SLczrA7B93WUCdJ8rWJMzmJsSLcTkmdzY4BAOcFCjoAADBUWlqa2RFMUVVVJUnqkRpuZbVz2P6dA0BzUdABAIChCgoKzI5giuOvu7i42OQkAACr4hx0AAAAAAAsgIIOAAAAAIAFcIg7AAAIGxUVFSovLzdl7OPnoJt1iH92draysrJMGRsAcGYo6AAAAAZISkoyOwIAwOIo6AAAIGxkZWWxihxmvnZFau4Wh9kxDPVTvU2S1D7aZ3ISY33tilS62SGAs0RBBwAAQKsUrrd3++Yfp1N0uriHyUmMla7w/TtH60FBBwAAQKvELf24pR9wvuEq7gAAAAAAWAAFHQAAAAAAC6CgAwAAAABgAZyDDgAAAJxjFRUVKi8vN2Xsqn9cJM6sc/Czs7O5WwLQQhR0AAAAoBVJSkoyOwKAFqKgAwAAAOdYVlYWq8gAmo1z0AEAAAAAsAAKOgAAAAAAFsAh7gAAAABwtg5KEe+H2frn0X/8P9bUFMY7KOnC0HxqCjoAAAAAnIW0tDSzI5ji+B0DelzYw+QkBrswdH/nFHQAAAAAOAtm3dLObMdfd3FxsclJWo8wOwYDAAAAAABroqADAAAAAGABFHQAAAAAACyAgg4AAAAAgAVQ0AEAAAAAsAAKOgAAAAAAFkBBBwAAAADAAijoAAAAAABYAAUdAAAAAAALoKADAAAAAGABFHQAAAAAACyAgg4AAAAAgAVQ0AEAAAAAsAAKOgAAAAAAFkBBBwAAAADAAijoAAAAAABYAAUdAAAAAAALiDJqoKNHj2rKlCnav3+/2rZtqwULFigxMfGE57z66qtauXKloqKidN999+nGG2+Uz+fTddddp4svvliSlJGRod/85jdGxQYAAAAAwBCGFfRXXnlF6enpmjhxotauXauSkhLNmDHDv33fvn0qLS3V66+/Lo/Ho9tvv119+/bVd999p5/97Gd67rnnjIoKAAAAAIDhDCvoW7du1V133SVJuu6661RSUnLC9s8++0y9evVSdHS0oqOj1b17d+3cuVP/93//p9raWo0ZM0axsbF6+OGHdckll5x2LI/Hox07doTstQAAAOZbAAh3brdbkpgLzqGQFPTXXntNy5cvP+GxpKQktWvXTpLUtm1bHT58+ITtLpfLv/34c1wulzp27Kh77rlHgwcP1pYtWzRlyhS9/vrrpx0/JiZGPXv2PEevBgCA8NGcN1nMtwAQ3ux2uyQxF7RAoPk2JAU9NzdXubm5Jzw2YcIEHTlyRJJ05MgRxcfHn7Dd4XD4tx9/Trt27ZSWlqbIyEhJUp8+fVRbWyufzyebzRaK6AAAAAAAmMKwq7hnZmbqgw8+kCRt3LhRvXv3PmH7z3/+c23dulUej0eHDx/W7t27lZ6ermeffda/Gr9z50517dqVcg4AAAAAaHUMOwc9Pz9f
U6dOVX5+vtq0aaOFCxdKkn73u9+pe/fuuummmzRmzBjdfvvt8vl8mjRpkmJiYnTPPfdoypQp+uCDDxQZGan58+cbFRkAAAAAAMMYVtDj4uJUXFx80uN33nmn/8+33XabbrvtthO2t2/fXsuWLQt5PgAAAAAAzGTYIe4AAAAAACAwCjoAAAAAABZAQQcAAAAAwAIo6AAAAAAAWAAFHQAAAAAAC6CgAwAAAABgARR0AAAAAAAsgIIOAAAAAIAFUNABAAAAALAACjoAAAAAABZAQQcAAAAAwAIo6AAAAAAAWAAFHQAAAAAAC6CgAwAAAABgARR0AAAAAAAsgIIOAAAAAIAFUNABAAAAALAACjoAAAAAABZAQQcAAAAAwAIo6AAAAAAAWAAFHQAAAAAAC6CgAwAAAABgAVFmBwAAAAAAtExFRYXKy8tNGbuqqkqSVFBQYMr42dnZysrKMmXsUKGgAwAAAACaLSkpyewIrQ4FHQAAAADOU1lZWa1uFTmccQ46AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAugoAMAAAAAYAEUdAAAAAAALICCDgAAAACABVDQAQAAAACwAAo6AAAAAAAWQEEHAAAAAMACKOgAAAAAAFgABR0AAAAAAAuIMjtAKHg8Hu3YscPsGAAAnHc8Hk+znst8CwBA8wWab20+n89ncBYAAAAAAPBvOMQdAAAAAAALoKADAAAAAGABFHQAAAAAACyAgg4AAAAAgAVQ0AEAAAAAsAAKOgAAAAAAFtAq74OO5vvrX/+qp556SqWlpWZHQSs2fPhwtWvXTpLUrVs3zZ8/3+REaC0aGho0ffp0ffvtt6qvr9d9992nm266SZJUWFiolJQU5efnm5wS4Y65FkZhvkWoMN+GHgUdev755/XWW28pLi7O7ChoxTwejyTxxhQh8dZbbykhIUFPPvmkDhw4oJycHPXq1UsPPfSQ9u7dq/Hjx5sdEWGOuRZGYb5FKDHfhh6HuEPdu3fXM888Y3YMtHI7d+5UXV2dxo0bpzvuuEOVlZVmR0IrkpWVpfvvv9//cWRkpI4cOaKJEydq2LBhJiYDjmGuhVGYbxFKzLehR0GHbr75ZkVFcTAFQis2Nlbjx4/Xiy++qMcee0wPPvigGhsbzY6FVqJt27ZyOBxyuVwqKCjQAw88oIsuukhXXXWV2dEAScy1MA7zLUKJ+Tb0KOgADJGSkqJbbrlFNptNKSkpSkhI0L59+8yOhVbku+++0x133KFhw4Zp6NChZscBAFMw3yLUmG9Di4IOwBCrVq1SUVGRJKm2tlYul0sdO3Y0ORVaix9//FHjxo3TlClTNHLkSLPjAIBpmG8RSsy3oUdBB2CIkSNH6vDhw8rPz9ekSZNUWFjI4Z44Z5577jkdOnRIJSUlGjNmjMaMGaOjR4+aHQsADMd8i1Bivg09m8/n85kdAgAAAACAcMcKOgAAAAAAFkBBBwAAAADAAijoAAAAAABYAAUdAAAAAAALoKADAAAAAGABFHQAp/TGG2/oqaeeatY+AwYMkMfjCfq8jz/+WJMmTWpptJNs3LhR06ZNO2efDwAAozDfAvhXFHQAAAAAACwgyuwAAKztpZde0tq1axUVFaU+ffpoypQpOnTokKZMmSKXy6Wmpibdf//9uuaaa/z7vPLKK9q8ebMWLVqk9evXa8WKFf5tixcvliTV1NTorrvuktPp1I033qiJEydq165dmjt3riQpISFBhYWFstvtmjVrlr7//nsdOHBA1113nR544AHt3r1b06dPV1xcnOLi4tS+fXtjvzAAAJxDzLcAJAo6gNOoqanRxx9/rJUrVyoqKkoTJ07Uhg0b9Mknn+jaa6/V2LFjVVtbq/z8fK1bt06SVFpaqh07dmjx4sWKjIzU3r17tWzZMsXFxWnWrFnatGmTOnXqJI/Ho5KSEjU1NemGG27QxIkTNXPmTBUWFiotLU2vvfaaXnjhBeXm5iojI0O5ubnyeDz+
NwyLFy9WQUGB+vbtq2XLlmnPnj0mf7UAAGgZ5lsAx1HQAQS0Y8cO3XDDDWrTpo0kqU+fPqqqqtLu3bs1dOhQSVKnTp3kcDjkdDolSR9++KEiIyMVGRkpSUpKStLUqVPVtm1b7dmzRxkZGZKkHj16KDo6WpIUFXXsR9Hu3bv12GOPSZIaGhqUkpKihIQEff755/roo4/kcDhUX18vSaqqqtLPf/5zSVJmZiZvGAAA5y3mWwDHcQ46gIB69uypzz77TI2NjfL5fPr000+VkpKi1NRUbdmyRZJUW1urQ4cOKSEhQZJUUlKi+Ph4vfLKKzp8+LCKi4v129/+VnPnzlVMTIx8Pp8kyWaznTReSkqKFixYoNLSUk2ZMkXXX3+93njjDbVr104LFy7UuHHjdPToUfl8Pl1yySXatm2bJGn79u0GfUUAADj3mG8BHMcKOoCAkpOTlZmZqfz8fHm9XvXu3VsDBw7UL37xC02fPl1//OMfdfToUc2ZM8f/W3lJmjFjhnJzc3XNNdcoMzNTOTk5stvtio+P1w8//KBu3bqdcrzZs2dr6tSpampqkiTNmzdPqampmjx5srZu3aq4uDglJyfrhx9+0KOPPqpJkybpxRdfVGJiomJiYgz5mgAAcK4x3wI4zuY7/us1AAAAAABgGg5xBwAAAADAAijoAAAAAABYAAUdAAAAAAALoKADAAAAAGABFHQAAAAAACyAgg4AAAAAgAVQ0AEAAAAAsID/B7c+/IDsOS/iAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 1008x360 with 2 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Side-by-side box plots of the two IC measures, grouped by lookahead.\n",
    "fig, axes = plt.subplots(ncols=2, figsize=(14, 5), sharey=True)\n",
    "panels = [('ic_by_day', 'IC by Day', 'Information Coefficient'),\n",
    "          ('ic', 'IC Overall', '')]\n",
    "for ax, (metric, title, ylabel) in zip(axes, panels):\n",
    "    sns.boxplot(x='lookahead', y=metric, data=lr_metrics, ax=ax)\n",
    "    ax.set_title(title)\n",
    "    ax.set_ylabel(ylabel)\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Best Train/Test Period Lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.911949Z",
     "start_time": "2020-06-21T03:19:37.898655Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>lookahead</th>\n",
       "      <th>train_length</th>\n",
       "      <th>test_length</th>\n",
       "      <th>ic_by_day</th>\n",
       "      <th>ic</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>252</td>\n",
       "      <td>21</td>\n",
       "      <td>0.072497</td>\n",
       "      <td>-0.008860</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1134</td>\n",
       "      <td>21</td>\n",
       "      <td>0.054182</td>\n",
       "      <td>-0.004547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>252</td>\n",
       "      <td>63</td>\n",
       "      <td>0.036205</td>\n",
       "      <td>-0.016078</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>5</td>\n",
       "      <td>252</td>\n",
       "      <td>21</td>\n",
       "      <td>0.188260</td>\n",
       "      <td>-0.023782</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>5</td>\n",
       "      <td>1134</td>\n",
       "      <td>21</td>\n",
       "      <td>0.153993</td>\n",
       "      <td>-0.016251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>1134</td>\n",
       "      <td>63</td>\n",
       "      <td>0.076561</td>\n",
       "      <td>0.055373</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>21</td>\n",
       "      <td>1134</td>\n",
       "      <td>21</td>\n",
       "      <td>0.144771</td>\n",
       "      <td>-0.012503</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>21</td>\n",
       "      <td>252</td>\n",
       "      <td>21</td>\n",
       "      <td>0.126590</td>\n",
       "      <td>-0.078876</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>21</td>\n",
       "      <td>1134</td>\n",
       "      <td>63</td>\n",
       "      <td>0.108476</td>\n",
       "      <td>0.096012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    lookahead  train_length  test_length  ic_by_day        ic\n",
       "3           1           252           21   0.072497 -0.008860\n",
       "1           1          1134           21   0.054182 -0.004547\n",
       "2           1           252           63   0.036205 -0.016078\n",
       "7           5           252           21   0.188260 -0.023782\n",
       "5           5          1134           21   0.153993 -0.016251\n",
       "4           5          1134           63   0.076561  0.055373\n",
       "9          21          1134           21   0.144771 -0.012503\n",
       "11         21           252           21   0.126590 -0.078876\n",
       "8          21          1134           63   0.108476  0.096012"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Three rows with the largest `ic_by_day` for each lookahead.\n",
    "(lr_metrics\n",
    " .groupby('lookahead', group_keys=False)\n",
    " .apply(lambda grp: grp.nlargest(3, 'ic_by_day')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.920850Z",
     "start_time": "2020-06-21T03:19:37.913179Z"
    }
   },
   "outputs": [],
   "source": [
    "# Persist `lr_metrics` as CSV without the row index.\n",
    "lin_reg_metrics_file = results_path / 'lin_reg_metrics.csv'\n",
    "lr_metrics.to_csv(lin_reg_metrics_file, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LightGBM Model Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.935808Z",
     "start_time": "2020-06-21T03:19:37.921761Z"
    }
   },
   "outputs": [],
   "source": [
    "def get_fi(model):\n",
    "    \"\"\"Return gain-based feature importance as shares of the total.\n",
    "\n",
    "    Calls `model.feature_importance(importance_type='gain')`, divides by its\n",
    "    sum and labels the values with `model.feature_name()`.\n",
    "    \"\"\"\n",
    "    gain = model.feature_importance(importance_type='gain')\n",
    "    shares = gain / gain.sum()\n",
    "    return pd.Series(shares, index=model.feature_name())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hyperparameter Options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.943601Z",
     "start_time": "2020-06-21T03:19:37.936926Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Settings merged into every hyperparameter combination before training.\n",
    "base_params = {'boosting': 'gbdt',\n",
    "               'objective': 'regression',\n",
    "               'verbose': -1}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.952308Z",
     "start_time": "2020-06-21T03:19:37.944671Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Candidate values for each tuned LightGBM setting.\n",
    "learning_rate_ops = [0.01, 0.1, 0.3]\n",
    "feature_fraction_opts = [0.3, 0.6, 0.95]\n",
    "min_data_in_leaf_opts = [250, 500, 1000]\n",
    "# num_leaves is derived as 2 ** depth for each entry of max_depths.\n",
    "max_depths = [2, 3, 5, 7]\n",
    "num_leaves_opts = [2 ** depth for depth in max_depths]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.960258Z",
     "start_time": "2020-06-21T03:19:37.953744Z"
    }
   },
   "outputs": [],
   "source": [
    "# Order must match the argument order of the `product(...)` call that builds `cv_params`,\n",
    "# because the two are zipped together into the parameter dict.\n",
    "param_names = [\n",
    "    'learning_rate',\n",
    "    'num_leaves',\n",
    "    'feature_fraction',\n",
    "    'min_data_in_leaf',\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.969006Z",
     "start_time": "2020-06-21T03:19:37.961383Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# Parameters: 108\n"
     ]
    }
   ],
   "source": [
    "# Full grid: every combination of the option lists, as tuples ordered like `param_names`.\n",
    "option_lists = (learning_rate_ops,\n",
    "                num_leaves_opts,\n",
    "                feature_fraction_opts,\n",
    "                min_data_in_leaf_opts)\n",
    "cv_params = list(product(*option_lists))\n",
    "n_params = len(cv_params)\n",
    "print(f'# Parameters: {n_params}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train/Test Period Lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.977749Z",
     "start_time": "2020-06-21T03:19:37.969903Z"
    }
   },
   "outputs": [],
   "source": [
    "# Pair each lookahead with a label column by position.\n",
    "# NOTE(review): relies on `labels` already existing and being ordered like `lookaheads` - confirm.\n",
    "lookaheads = [1, 5, 21]\n",
    "label_dict = {horizon: label_col for horizon, label_col in zip(lookaheads, labels)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.986465Z",
     "start_time": "2020-06-21T03:19:37.978744Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Period lengths passed to MultipleTimeSeriesCV: train on 1134 (= int(4.5 * 252)) or 252, test on 63.\n",
    "test_lengths = [63]\n",
    "train_lengths = [int(4.5 * 252), 252]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:37.995277Z",
     "start_time": "2020-06-21T03:19:37.987379Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train configs: 6\n"
     ]
    }
   ],
   "source": [
    "# All (lookahead, train_length, test_length) combinations.\n",
    "test_params = list(product(lookaheads, train_lengths, test_lengths))\n",
    "n = len(test_params)\n",
    "# Drawing n of n indices without replacement is a random permutation, so every\n",
    "# configuration is kept and only the visiting order changes.\n",
    "# NOTE(review): no seed is set in this cell, so the order differs between runs unless seeded elsewhere.\n",
    "test_param_sample = np.random.choice(list(range(n)), size=int(n), replace=False)\n",
    "test_params = [test_params[i] for i in test_param_sample]\n",
    "print('Train configs:', len(test_params))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Categorical Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.043537Z",
     "start_time": "2020-06-21T03:19:37.996178Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Replace each categorical column of `data` with integer codes (sorted unique values), in place.\n",
    "categoricals = ['year', 'weekday', 'month']\n",
    "for feature in categoricals:\n",
    "    codes, uniques = pd.factorize(data[feature], sort=True)\n",
    "    data[feature] = codes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Custom Evaluation Metric: Information Coefficient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.046578Z",
     "start_time": "2020-06-21T03:19:38.044405Z"
    }
   },
   "outputs": [],
   "source": [
    "def ic_lgbm(preds, train_data):\n",
    "    \"\"\"Custom IC eval metric for lightgbm.\n",
    "\n",
    "    Returns the triple `('ic', rho, True)` where `rho` is the Spearman rank\n",
    "    correlation between `preds` and `train_data.get_label()`; the final\n",
    "    `True` flags that higher values are better.\n",
    "    \"\"\"\n",
    "    rho = spearmanr(preds, train_data.get_label())[0]\n",
    "    return 'ic', rho, True"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.054547Z",
     "start_time": "2020-06-21T03:19:38.047670Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# HDF5 file that receives the LightGBM tuning results.\n",
    "lgb_store = Path(results_path, 'tuning_lgb.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.070171Z",
     "start_time": "2020-06-21T03:19:38.055637Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Label columns are those whose name contains 'fwd'; all remaining columns are features.\n",
    "labels = sorted(data.filter(like='fwd').columns.tolist())\n",
    "features = list(data.columns.difference(labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.078077Z",
     "start_time": "2020-06-21T03:19:38.071079Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Map each lookahead to its label column (paired by position with the sorted `labels`).\n",
    "label_dict = {horizon: label_col for horizon, label_col in zip(lookaheads, labels)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.086476Z",
     "start_time": "2020-06-21T03:19:38.078921Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Tree counts at which predictions are scored; training runs for the largest one.\n",
    "num_iterations = [10, 25, 50, 75, *range(100, 501, 50)]\n",
    "num_boost_round = max(num_iterations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T03:19:38.094204Z",
     "start_time": "2020-06-21T03:19:38.087354Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Index for the metrics Series built in the CV loop: hyperparameters, elapsed time,\n",
    "# daily-IC summary, then one IC entry per tree count.\n",
    "metric_cols = [*param_names,\n",
    "               't', 'daily_ic_mean', 'daily_ic_mean_n',\n",
    "               'daily_ic_median', 'daily_ic_median_n',\n",
    "               *map(str, num_iterations)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.000694Z",
     "start_time": "2020-06-21T03:19:38.095078Z"
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Lookahead:  1 | Train: 1134 | Test: 63 | Params:  54 | Train configs: 6\n",
      "  0 | 00:20:11 (1211) |  0.30 |   4 | 95% | 1000 |   2.20% |  1.80% |  400 |  2.00% |  200\n",
      "  1 | 00:32:43 (753) |  0.01 |   4 | 60% |  500 |   1.32% |  0.75% |   75 |  1.00% |  100\n",
      "  2 | 00:45:20 (756) |  0.10 |   4 | 60% |  500 |   2.78% |  1.39% |  500 |  1.40% |  350\n",
      "  3 | 02:41:26 (6967) |  0.10 | 128 | 95% |  250 |   3.81% |  1.47% |  100 |  1.61% |  350\n",
      "  4 | 03:00:57 (1170) |  0.01 |   8 | 30% |  500 |   1.29% |  1.25% |  500 |  0.94% |  400\n",
      "  5 | 03:21:13 (1217) |  0.10 |   8 | 30% |  250 |   2.39% |  1.87% |  500 |  1.54% |  200\n",
      "  6 | 05:25:10 (7437) |  0.10 | 128 | 95% |  500 |   3.17% |  1.77% |  200 |  1.76% |  300\n",
      "  7 | 07:04:44 (5974) |  0.10 | 128 | 60% |  250 |   3.13% |  1.60% |  150 |  1.66% |  350\n",
      "  8 | 07:24:02 (1158) |  0.10 |   8 | 30% | 1000 |   2.43% |  1.83% |  350 |  1.74% |  250\n",
      "  9 | 07:44:13 (1211) |  0.01 |   8 | 30% | 1000 |   1.79% |  1.28% |  500 |  1.33% |  450\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-35-09269b081556>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     38\u001b[0m                               \u001b[0mtrain_set\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlgb_train\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m                               \u001b[0mnum_boost_round\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum_boost_round\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m                               verbose_eval=False)\n\u001b[0m\u001b[1;32m     41\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     42\u001b[0m                 \u001b[0mfi\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_fi\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_frame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.7/site-packages/lightgbm/engine.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(params, train_set, num_boost_round, valid_sets, valid_names, fobj, feval, init_model, feature_name, categorical_feature, early_stopping_rounds, evals_result, verbose_eval, learning_rates, keep_training_booster, callbacks)\u001b[0m\n\u001b[1;32m    247\u001b[0m                                     evaluation_result_list=None))\n\u001b[1;32m    248\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m         \u001b[0mbooster\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    251\u001b[0m         \u001b[0mevaluation_result_list\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.7/site-packages/lightgbm/basic.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, train_set, fobj)\u001b[0m\n\u001b[1;32m   1974\u001b[0m             _safe_call(_LIB.LGBM_BoosterUpdateOneIter(\n\u001b[1;32m   1975\u001b[0m                 \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1976\u001b[0;31m                 ctypes.byref(is_finished)))\n\u001b[0m\u001b[1;32m   1977\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__is_predicted_cur_iter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;32mFalse\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__num_dataset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1978\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mis_finished\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "# Cross-validate LightGBM for every train/test configuration and store metrics, daily IC,\n",
    "# feature importance and predictions in `lgb_store`, one HDF5 key per hyperparameter combination.\n",
    "for lookahead, train_length, test_length in test_params:\n",
    "    # Evaluate a random half of the hyperparameter grid for this configuration.\n",
    "    cvp = np.random.choice(list(range(n_params)),\n",
    "                           size=int(n_params / 2),\n",
    "                           replace=False)\n",
    "    cv_params_ = [cv_params[i] for i in cvp]\n",
    "\n",
    "    n_splits = int(2 * YEAR / test_length)\n",
    "    print(f'Lookahead: {lookahead:2.0f} | '\n",
    "          f'Train: {train_length:3.0f} | '\n",
    "          f'Test: {test_length:2.0f} | '\n",
    "          f'Params: {len(cv_params_):3.0f} | '\n",
    "          f'Train configs: {len(test_params)}')\n",
    "\n",
    "    cv = MultipleTimeSeriesCV(n_splits=n_splits,\n",
    "                              lookahead=lookahead,\n",
    "                              test_period_length=test_length,\n",
    "                              train_period_length=train_length)\n",
    "\n",
    "    label = label_dict[lookahead]\n",
    "    outcome_data = data.loc[:, features + [label]].dropna()\n",
    "    lgb_data = lgb.Dataset(data=outcome_data.drop(label, axis=1),\n",
    "                           label=outcome_data[label],\n",
    "                           categorical_feature=categoricals,\n",
    "                           free_raw_data=False)\n",
    "    T = 0  # cumulative seconds spent on this configuration\n",
    "    for p, param_vals in enumerate(cv_params_):\n",
    "        key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join(str(v) for v in param_vals)\n",
    "        params = dict(zip(param_names, param_vals))\n",
    "        params.update(base_params)\n",
    "\n",
    "        start = time()\n",
    "        cv_preds = []\n",
    "        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):\n",
    "            lgb_train = lgb_data.subset(train_idx.tolist()).construct()\n",
    "            model = lgb.train(params=params,\n",
    "                              train_set=lgb_train,\n",
    "                              num_boost_round=num_boost_round,\n",
    "                              verbose_eval=False)\n",
    "            # One feature-importance column per CV fold.\n",
    "            if i == 0:\n",
    "                fi = get_fi(model).to_frame()\n",
    "            else:\n",
    "                fi[i] = get_fi(model)\n",
    "\n",
    "            test_set = outcome_data.iloc[test_idx, :]\n",
    "            X_test = test_set.loc[:, model.feature_name()]\n",
    "            y_test = test_set.loc[:, label]\n",
    "            # Train once with the maximum number of rounds, then predict with each truncated tree count.\n",
    "            y_pred = {str(n): model.predict(X_test, num_iteration=n) for n in num_iterations}\n",
    "            cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))\n",
    "        # Predictions are written to the store below rather than accumulated in memory.\n",
    "        cv_preds = pd.concat(cv_preds).assign(**params)\n",
    "        by_day = cv_preds.groupby(level='date')\n",
    "        ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)\n",
    "                               for n in num_iterations], axis=1)\n",
    "        daily_ic_mean = ic_by_day.mean()\n",
    "        daily_ic_mean_n = daily_ic_mean.idxmax()\n",
    "        daily_ic_median = ic_by_day.median()\n",
    "        daily_ic_median_n = daily_ic_median.idxmax()\n",
    "\n",
    "        # IC over all test predictions pooled together, per tree count.\n",
    "        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]\n",
    "        t = time() - start\n",
    "        T += t\n",
    "        metrics = pd.Series(list(param_vals) +\n",
    "                            [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,\n",
    "                            index=metric_cols)\n",
    "        msg = f'{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params[\"learning_rate\"]:5.2f} | '\n",
    "        msg += f'{params[\"num_leaves\"]:3.0f} | {params[\"feature_fraction\"]:3.0%} | {params[\"min_data_in_leaf\"]:4.0f} | '\n",
    "        msg += f' {max(ic):6.2%} | {daily_ic_mean.max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {daily_ic_median.max(): 6.2%} | {daily_ic_median_n: 4.0f}'\n",
    "        print(msg)\n",
    "\n",
    "        metrics.to_hdf(lgb_store, 'metrics/' + key)\n",
    "        ic_by_day.assign(**params).to_hdf(lgb_store, 'daily_ic/' + key)\n",
    "        fi.T.describe().T.assign(**params).to_hdf(lgb_store, 'fi/' + key)\n",
    "        cv_preds.to_hdf(lgb_store, 'predictions/' + key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CatBoost Model Tuning"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hyperparameter Options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.002366Z",
     "start_time": "2020-06-21T03:17:13.861Z"
    }
   },
   "outputs": [],
   "source": [
    "# CatBoost grid; note this rebinds `param_names`, which the LightGBM cells also use.\n",
    "max_depth_opts = [3, 5, 7, 9]\n",
    "min_child_samples_opts = [20, 250, 500]\n",
    "\n",
    "param_names = ['max_depth', 'min_child_samples']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.002915Z",
     "start_time": "2020-06-21T03:17:14.083Z"
    }
   },
   "outputs": [],
   "source": [
    "# Every (max_depth, min_child_samples) pair, ordered like `param_names`.\n",
    "cv_params = [(depth, min_samples)\n",
    "             for depth in max_depth_opts\n",
    "             for min_samples in min_child_samples_opts]\n",
    "n_params = len(cv_params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Train/Test Period Lengths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.003547Z",
     "start_time": "2020-06-21T03:17:14.456Z"
    }
   },
   "outputs": [],
   "source": [
    "# Map each lookahead to its label column (paired by position with `labels`).\n",
    "lookaheads = [1, 5, 21]\n",
    "label_dict = {horizon: label_col for horizon, label_col in zip(lookaheads, labels)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.004201Z",
     "start_time": "2020-06-21T03:17:16.556Z"
    }
   },
   "outputs": [],
   "source": [
    "# Period lengths passed to MultipleTimeSeriesCV: train on 1134 (= int(4.5 * 252)) or 252, test on 63.\n",
    "test_lengths = [63]\n",
    "train_lengths = [int(4.5 * 252), 252]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.004875Z",
     "start_time": "2020-06-21T03:17:17.116Z"
    }
   },
   "outputs": [],
   "source": [
    "# All (lookahead, train_length, test_length) combinations, in product order.\n",
    "test_params = [*product(lookaheads, train_lengths, test_lengths)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Custom Evaluation Metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.005692Z",
     "start_time": "2020-06-21T03:17:18.773Z"
    }
   },
   "outputs": [],
   "source": [
    "class CatBoostIC(object):\n",
    "    \"\"\"Custom IC eval metric for CatBoost.\n",
    "\n",
    "    Scores predictions by their Spearman rank correlation with the target.\n",
    "    \"\"\"\n",
    "\n",
    "    def is_max_optimal(self):\n",
    "        \"\"\"Report that larger metric values are better.\"\"\"\n",
    "        return True\n",
    "\n",
    "    def evaluate(self, approxes, target, weight):\n",
    "        \"\"\"Return `(rho, 1)` for the flattened `approxes` versus `target`; `weight` is ignored.\"\"\"\n",
    "        flat_preds = np.asarray(approxes).ravel()\n",
    "        actual = np.asarray(target)\n",
    "        return spearmanr(flat_preds, actual)[0], 1\n",
    "\n",
    "    def get_final_error(self, error, weight):\n",
    "        \"\"\"Pass the accumulated error through unchanged; `weight` is ignored.\"\"\"\n",
    "        return error"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Run Cross-Validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.006427Z",
     "start_time": "2020-06-21T03:17:29.495Z"
    }
   },
   "outputs": [],
   "source": [
    "# HDF5 file that receives the CatBoost tuning results.\n",
    "cb_store = Path(results_path, 'tuning_catboost.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.007167Z",
     "start_time": "2020-06-21T03:17:29.677Z"
    }
   },
   "outputs": [],
   "source": [
    "# Tree counts at which predictions are scored; the largest one is the boosting budget.\n",
    "num_iterations = [10, 25, 50, 75, *range(100, 1001, 100)]\n",
    "num_boost_round = max(num_iterations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.007861Z",
     "start_time": "2020-06-21T03:17:29.812Z"
    }
   },
   "outputs": [],
   "source": [
    "# Index for the metrics Series built in the CV loop: hyperparameters, elapsed time,\n",
    "# daily-IC summary, then one IC entry per tree count.\n",
    "metric_cols = [*param_names,\n",
    "               't', 'daily_ic_mean', 'daily_ic_mean_n',\n",
    "               'daily_ic_median', 'daily_ic_median_n',\n",
    "               *map(str, num_iterations)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.008624Z",
     "start_time": "2020-06-21T03:17:30.033Z"
    }
   },
   "outputs": [],
   "source": [
    "# Cross-validate CatBoost for every train/test configuration and store metrics, daily IC\n",
    "# and predictions in `cb_store`, one HDF5 key per hyperparameter combination.\n",
    "for lookahead, train_length, test_length in test_params:\n",
    "    # size == n_params without replacement: the full grid in random order.\n",
    "    cvp = np.random.choice(list(range(n_params)),\n",
    "                           size=int(n_params / 1),\n",
    "                           replace=False)\n",
    "    cv_params_ = [cv_params[i] for i in cvp]\n",
    "\n",
    "    n_splits = int(2 * YEAR / test_length)\n",
    "    print(f'Lookahead: {lookahead:2.0f} | Train: {train_length:3.0f} | '\n",
    "          f'Test: {test_length:2.0f} | Params: {len(cv_params_):3.0f} | Train configs: {len(test_params)}')\n",
    "    cv = MultipleTimeSeriesCV(n_splits=n_splits,\n",
    "                              lookahead=lookahead,\n",
    "                              test_period_length=test_length,\n",
    "                              train_period_length=train_length)\n",
    "\n",
    "    label = label_dict[lookahead]\n",
    "    outcome_data = data.loc[:, features + [label]].dropna()\n",
    "    cat_cols_idx = [outcome_data.columns.get_loc(c) for c in categoricals]\n",
    "    catboost_data = Pool(label=outcome_data[label],\n",
    "                         data=outcome_data.drop(label, axis=1),\n",
    "                         cat_features=cat_cols_idx)\n",
    "    T = 0  # cumulative seconds spent on this configuration\n",
    "    for p, param_vals in enumerate(cv_params_):\n",
    "        # The key must include the hyperparameter values; a key shared by all combinations\n",
    "        # would make each iteration overwrite the previous one's results in the store.\n",
    "        key = f'{lookahead}/{train_length}/{test_length}/' + '/'.join(str(v) for v in param_vals)\n",
    "        params = dict(zip(param_names, param_vals))\n",
    "        params['task_type'] = 'GPU'\n",
    "\n",
    "        start = time()\n",
    "        cv_preds = []\n",
    "        for i, (train_idx, test_idx) in enumerate(cv.split(X=outcome_data)):\n",
    "            train_set = catboost_data.slice(train_idx.tolist())\n",
    "\n",
    "            # Train exactly as many trees as the largest `ntree_end` evaluated below,\n",
    "            # instead of relying on the library default matching `num_iterations[-1]`.\n",
    "            model = CatBoostRegressor(iterations=num_boost_round, **params)\n",
    "            model.fit(X=train_set,\n",
    "                      verbose_eval=False)\n",
    "\n",
    "            test_set = outcome_data.iloc[test_idx, :]\n",
    "            X_test = test_set.loc[:, model.feature_names_]\n",
    "            y_test = test_set.loc[:, label]\n",
    "            y_pred = {str(n): model.predict(X_test, ntree_end=n) for n in num_iterations}\n",
    "            cv_preds.append(y_test.to_frame('y_test').assign(**y_pred).assign(i=i))\n",
    "        cv_preds = pd.concat(cv_preds).assign(**params)\n",
    "        by_day = cv_preds.groupby(level='date')\n",
    "        ic_by_day = pd.concat([by_day.apply(lambda x: spearmanr(x.y_test, x[str(n)])[0]).to_frame(n)\n",
    "                               for n in num_iterations], axis=1)\n",
    "        daily_ic_mean = ic_by_day.mean()\n",
    "        daily_ic_mean_n = daily_ic_mean.idxmax()\n",
    "        daily_ic_median = ic_by_day.median()\n",
    "        daily_ic_median_n = daily_ic_median.idxmax()\n",
    "\n",
    "        # IC over all test predictions pooled together, per tree count.\n",
    "        ic = [spearmanr(cv_preds.y_test, cv_preds[str(n)])[0] for n in num_iterations]\n",
    "        t = time() - start\n",
    "        T += t\n",
    "        metrics = pd.Series(list(param_vals) +\n",
    "                            [t, daily_ic_mean.max(), daily_ic_mean_n, daily_ic_median.max(), daily_ic_median_n] + ic,\n",
    "                            index=metric_cols)\n",
    "        msg = f'{p:3.0f} | {format_time(T)} ({t:3.0f}) | {params[\"max_depth\"]:3.0f} | {params[\"min_child_samples\"]:4.0f} | '\n",
    "        msg += f' {max(ic):6.2%} | {daily_ic_mean.max(): 6.2%} | {daily_ic_mean_n: 4.0f} | {daily_ic_median.max(): 6.2%} | {daily_ic_median_n: 4.0f}'\n",
    "        print(msg)\n",
    "        metrics.to_hdf(cb_store, 'metrics/' + key)\n",
    "        ic_by_day.assign(**params).to_hdf(cb_store, 'daily_ic/' + key)\n",
    "        cv_preds.to_hdf(cb_store, 'predictions/' + key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Generate LightGBM predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We'll generate predictions for 2016 using LightGBM; you can do so for CatBoost following the same pattern."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.009271Z",
     "start_time": "2020-06-21T03:17:33.288Z"
    }
   },
   "outputs": [],
   "source": [
    "# Settings merged into each selected parameter set before training.\n",
    "base_params = {'boosting': 'gbdt',\n",
    "               'objective': 'regression',\n",
    "               'verbose': -1}\n",
    "\n",
    "# Columns that are integer-encoded and passed to LightGBM as categorical features.\n",
    "categoricals = ['year', 'month', 'sector', 'weekday']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.009892Z",
     "start_time": "2020-06-21T03:17:33.576Z"
    }
   },
   "outputs": [],
   "source": [
    "# Lookahead used below to pick the label column (`r01_fwd`) and the parameter sets.\n",
    "lookahead = 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.010579Z",
     "start_time": "2020-06-21T03:18:08.454Z"
    }
   },
   "outputs": [],
   "source": [
    "# Reload the model data, keep the features plus the label for the chosen lookahead,\n",
    "# and restrict to rows whose second index level is '2010' or later.\n",
    "data = pd.read_hdf('data.h5', 'model_data').sort_index()\n",
    "labels = sorted(data.filter(like='_fwd').columns)\n",
    "features = data.columns.difference(labels).tolist()\n",
    "label = f'r{lookahead:02}_fwd'\n",
    "# Use pd.IndexSlice directly so this cell does not depend on an `idx` alias from another cell.\n",
    "data = data.loc[pd.IndexSlice[:, '2010':], features + [label]].dropna()\n",
    "# Integer-encode the categorical columns (sorted unique values).\n",
    "for feature in categoricals:\n",
    "    data[feature] = pd.factorize(data[feature], sort=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.011175Z",
     "start_time": "2020-06-21T03:18:08.653Z"
    }
   },
   "outputs": [],
   "source": [
    "# Full LightGBM dataset; CV folds are taken from it with `.subset(...)`.\n",
    "# free_raw_data=False keeps the raw data attached to the Dataset.\n",
    "lgb_data = lgb.Dataset(\n",
    "    data=data.loc[:, features],\n",
    "    label=data.loc[:, label],\n",
    "    categorical_feature=categoricals,\n",
    "    free_raw_data=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.011891Z",
     "start_time": "2020-06-21T03:18:36.281Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Retrain LightGBM with each of ten parameter sets (`best=0..9`) and collect the\n",
    "# out-of-sample predictions, one column per position.\n",
    "# NOTE(review): `get_params` and `daily_ic_avg` are not defined in the cells visible here -\n",
    "# confirm they exist in the kernel before this cell runs.\n",
    "for position in range(10):\n",
    "    params = get_params(daily_ic_avg,\n",
    "                    t=lookahead,\n",
    "                    best=position)\n",
    "    \n",
    "    params = params.to_dict()\n",
    "    \n",
    "    # Cast to int (presumably stored as floats in the returned Series - TODO confirm).\n",
    "    for p in ['min_data_in_leaf', 'num_leaves']:\n",
    "        params[p] = int(params[p])\n",
    "    # These entries configure the CV splitter / training length, so they are removed\n",
    "    # from `params` before it is passed to lgb.train.\n",
    "    train_length = int(params.pop('train_length'))\n",
    "    test_length = int(params.pop('test_length'))\n",
    "    num_boost_round = int(params.pop('boost_rounds'))\n",
    "    params.update(base_params)\n",
    "\n",
    "    print(f'\\nPosition: {position:02}')\n",
    "\n",
    "    n_splits = int(1 * YEAR / test_length)\n",
    "    cv = MultipleTimeSeriesCV(n_splits=n_splits,\n",
    "                              test_period_length=test_length,\n",
    "                              lookahead=lookahead,\n",
    "                              train_period_length=train_length)\n",
    "\n",
    "    predictions = []\n",
    "    start = time()\n",
    "    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):\n",
    "        print(i, end=' ', flush=True)\n",
    "        lgb_train = lgb_data.subset(train_idx.tolist()).construct()\n",
    "\n",
    "        model = lgb.train(params=params,\n",
    "                          train_set=lgb_train,\n",
    "                          num_boost_round=num_boost_round,\n",
    "                          verbose_eval=False)\n",
    "\n",
    "        test_set = data.iloc[test_idx, :]\n",
    "        y_test = test_set.loc[:, label].to_frame('y_test')\n",
    "        y_pred = model.predict(test_set.loc[:, model.feature_name()])\n",
    "        predictions.append(y_test.assign(prediction=y_pred))\n",
    "\n",
    "    # Position 0 creates the frame (y_test plus column 0); later positions are added as\n",
    "    # columns aligned on the index.\n",
    "    if position == 0:\n",
    "        test_predictions = (pd.concat(predictions)\n",
    "                            .rename(columns={'prediction': position}))\n",
    "    else:\n",
    "        test_predictions[position] = pd.concat(predictions).prediction\n",
    "\n",
    "# Per-date Spearman rank correlation between y_test and each prediction column.\n",
    "by_day = test_predictions.groupby(level='date')\n",
    "for position in range(10):\n",
    "    if position == 0:\n",
    "        ic_by_day = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0]).to_frame()\n",
    "    else:\n",
    "        ic_by_day[position] = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0])\n",
    "print(ic_by_day.describe())\n",
    "# Note: this CSV path is relative to the working directory, unlike the HDF store below.\n",
    "ic_by_day.to_csv(f'ic_by_day_{lookahead:02}.csv')\n",
    "\n",
    "test_predictions.to_hdf(results_path / 'predictions.h5', f'lgb/test/{lookahead:02}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### AlphaLens Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Prepare Factor Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.012516Z",
     "start_time": "2020-06-21T03:18:43.955Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Lookahead whose stored test predictions (HDF key `lgb/test/<lookahead>`)\n",
    "# are turned into the factor.\n",
    "t = 1\n",
    "idx = pd.IndexSlice\n",
    "# Drop the label, average the first five prediction columns row-wise, then\n",
    "# make the 'date' level UTC-aware and swap the two index levels.\n",
    "factor = (pd.read_hdf(results_path / 'predictions.h5', key=f'lgb/test/{t:02}')\n",
    "          .drop(columns='y_test')\n",
    "          .iloc[:, :5]\n",
    "          .mean(axis=1)\n",
    "          .sort_index()\n",
    "          .dropna()\n",
    "          .tz_localize('UTC', level='date')\n",
    "          .swaplevel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.013173Z",
     "start_time": "2020-06-21T03:18:44.144Z"
    }
   },
   "outputs": [],
   "source": [
    "# First and last date covered by the factor.\n",
    "dates = factor.index.get_level_values(level='date')\n",
    "(dates.min(), dates.max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.013972Z",
     "start_time": "2020-06-21T03:18:44.327Z"
    }
   },
   "outputs": [],
   "source": [
    "# Preview the first five factor values.\n",
    "factor.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Select next available trade prices"
   ]
  },
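  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We select the next available trade price for every ticker in the factor; AlphaLens uses these prices to compute forward returns."
   ]
  },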
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.014582Z",
     "start_time": "2020-06-21T03:18:47.196Z"
    }
   },
   "outputs": [],
   "source": [
    "# Distinct symbols that appear in the factor index.\n",
    "tickers = (factor.index\n",
    "           .get_level_values('symbol')\n",
    "           .unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.015187Z",
     "start_time": "2020-06-21T03:18:47.366Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Fetch prices for the factor's tickers with the `get_trade_prices` helper\n",
    "# (defined elsewhere in this notebook), then print a summary of the result.\n",
    "# NOTE(review): presumably these are next-available prices, per the section\n",
    "# heading - confirm against the helper.\n",
    "trade_prices = get_trade_prices(tickers)\n",
    "trade_prices.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Get AlphaLens Inputs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.015831Z",
     "start_time": "2020-06-21T03:18:47.670Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Combine the factor with forward returns for four periods and assign\n",
    "# each observation to one of five factor quantiles.\n",
    "factor_data = get_clean_factor_and_forward_returns(\n",
    "    factor=factor,\n",
    "    prices=trade_prices,\n",
    "    quantiles=5,\n",
    "    periods=(1, 5, 10, 21),\n",
    ")\n",
    "# Summarize the resulting frame after sorting its index.\n",
    "factor_data.sort_index().info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Summary Tearsheet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.016495Z",
     "start_time": "2020-06-21T03:18:48.171Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Condensed AlphaLens report; the flags below are the library defaults,\n",
    "# spelled out to mirror the full tear sheet call.\n",
    "create_summary_tear_sheet(factor_data,\n",
    "                          long_short=True,\n",
    "                          group_neutral=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Full Tearsheet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-21T11:56:42.017180Z",
     "start_time": "2020-06-21T03:18:48.873Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Full AlphaLens report (returns, information coefficient, turnover).\n",
    "# factor_data was built without a `groupby` mapping, so it has no 'group'\n",
    "# column; by_group=True would raise a KeyError when AlphaLens groups by it.\n",
    "create_full_tear_sheet(factor_data,\n",
    "                       long_short=True,\n",
    "                       group_neutral=False,\n",
    "                       by_group=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "292.031px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
